In [1]:
# Enable inline matplotlib figures and populate the interactive namespace
# from numpy and matplotlib (this also imports numpy as np,
# matplotlib.pyplot as plt, and others).
# NOTE(review): the implicit bulk import makes it hard to tell where names
# come from; consider `%matplotlib inline` plus explicit imports once it is
# confirmed that no other cell relies on the injected names.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

Load an ARFF file into scikit-learn

Source: Stack Overflow (stackoverflow.com)


In [2]:
from sklearn.ensemble import RandomForestClassifier
from scipy.io.arff import loadarff

import scipy as sp
import numpy as np

# Parse the ARFF file inside a context manager so the file handle is closed
# as soon as loading finishes (the bare open() call used before leaked it).
with open('iris.arff', 'r') as arff_handle:
    dataset = loadarff(arff_handle)  # tuple: (record array, metadata)

records = dataset[0]
feature_columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']

# Labels come from the 'class' column.
target = np.array(records['class'])

# Selecting several fields yields a structured array; going through tolist()
# turns each record into a tuple so NumPy can build a plain 2-D float32 matrix.
train = np.asarray(records[feature_columns].tolist(), dtype=np.float32)

# NOTE(review): random_state is not set, so every run fits a different forest;
# pass a fixed seed if reproducible results are required.
rf = RandomForestClassifier(n_estimators=20, n_jobs=8)
rf.fit(train, target)
print(rf)

# -----------------------
arffFile = 'iris.arff'  # get it here http://www.cas.mcmaster.ca/~cs4tf3/iris.arff
# -----------------------

# loadarff returns the records together with a metadata object describing
# the attributes; the context manager closes the file once parsing is done.
with open(arffFile, 'r') as f:
    data, meta = loadarff(f)

print(type(data)) # <class 'numpy.ndarray'> 
print(type(meta)) # <class 'scipy.io.arff.arffread.MetaData'>

# Every attribute except the last one is treated as a feature column.
feature_names = meta.names()[:-1]
train = data[feature_names] #everything but the last column

# Convert the record array to a normal numpy array.
# The previous `train.view(np.float)` no longer works: the `np.float` alias
# was removed from NumPy, and a multi-field selection is a view that keeps the
# original record layout, so it cannot be reinterpreted as floats directly.
# Converting via tolist() copies just the selected fields; the explicit shape
# keeps the result 2-D even when the file contains no rows.
train = np.asarray(train.tolist(), dtype=np.float64).reshape(
    data.shape + (len(feature_names),)
)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
<class 'numpy.ndarray'>
<class 'scipy.io.arff.arffread.MetaData'>